IMPORTANT:

initial trajectory is important for final solution, with good initial trajectory it finds solution really easy. This suggests to use the old trajectory initial input to iLQG algorithm


In [3]:
import numpy as np
from drl.ilqg import ilqg, LearnedDynamics
from drl.env.arm import TwoLinkArm

env = TwoLinkArm(g=0., wp=10., wv=1., wu=0.001)

N = 5 # number of future steps for iLQG
Nf = 2 # number of time-steps ahead and after current time-step for fitting linear model
num_episodes = 25
max_steps = 75

full_state = True

model = LearnedDynamics(max_steps, num_episodes, env.state_dim, env.action_dim, Nf)

In [4]:
x = env.reset(full_state=full_state)
x0 = env.q
goal = env.goal

# Initialize random control sequence
u = np.random.randn(max_steps, env.action_dim)

# Simulate system once
reward = 0.
for i_step in range(max_steps):
    env.render()
    
    x_new, r, t, _ = env.step(u[i_step,:], full_state=full_state) 
    
    model.add(0, i_step, x, u[i_step,:], x_new)
    
    x = x_new    
    reward += r
print('Iter %d, Steps %d, Reward: %.2f, Average reward: %.2f' % (0, i_step+1, reward, reward/i_step))

# Only use first N control inputs for iLQG estimator
u = u[:N,:]
    
for i_episode in range(1, num_episodes):
    # Fit models
    model.fit()
    
    x = env.reset(x0, goal, full_state=full_state)
    terminal = False
    i_step = 0
    reward = 0.
    
    for i_step in range(max_steps):
        env.render()
        
        model.set_cur_step(i_step)

        _, u, L, Vx, Vxx, cost = ilqg(model.dynamics_func, env.cost_func, x, u, {})
                
        # Take step
        x_new, r, t, _ = env.step(u[0, :], full_state=full_state)

        # Add to data matrices
        model.add(i_episode, i_step, x, u[0, :], x_new)
        
        u = np.concatenate((u[1:,:], np.random.randn(1, env.action_dim))) 
        
        x = x_new
        reward += r
        i_step += 1
        
        if t:
            break
    
    print('Iter %d, Steps %d, Reward: %.2f, Average reward: %.2f' % (i_episode, i_step, reward, reward/i_step))


Iter 0, Steps 75, Reward: -1436.63, Average reward: -19.41
Iter 1, Steps 75, Reward: -998.66, Average reward: -13.32
Iter 2, Steps 75, Reward: -823.61, Average reward: -10.98
Iter 3, Steps 75, Reward: -694.03, Average reward: -9.25
Iter 4, Steps 75, Reward: -721.71, Average reward: -9.62
Iter 5, Steps 75, Reward: -702.38, Average reward: -9.37
Iter 6, Steps 75, Reward: -714.97, Average reward: -9.53
Iter 7, Steps 75, Reward: -750.33, Average reward: -10.00
Iter 8, Steps 75, Reward: -703.63, Average reward: -9.38
Iter 9, Steps 75, Reward: -702.08, Average reward: -9.36
Iter 10, Steps 75, Reward: -681.44, Average reward: -9.09
Iter 11, Steps 75, Reward: -689.65, Average reward: -9.20
Iter 12, Steps 75, Reward: -690.33, Average reward: -9.20
Iter 13, Steps 75, Reward: -691.93, Average reward: -9.23
Iter 14, Steps 75, Reward: -681.74, Average reward: -9.09
Iter 15, Steps 75, Reward: -689.35, Average reward: -9.19
Iter 16, Steps 75, Reward: -696.37, Average reward: -9.28
Iter 17, Steps 75, Reward: -698.86, Average reward: -9.32
Iter 18, Steps 75, Reward: -690.55, Average reward: -9.21
Iter 19, Steps 75, Reward: -693.54, Average reward: -9.25
Iter 20, Steps 75, Reward: -691.59, Average reward: -9.22
Iter 21, Steps 75, Reward: -692.29, Average reward: -9.23
Iter 22, Steps 75, Reward: -690.22, Average reward: -9.20
Iter 23, Steps 75, Reward: -690.10, Average reward: -9.20
Iter 24, Steps 75, Reward: -689.38, Average reward: -9.19

In [5]:
env.render(close=True)